Computational research group in Granada university and GENyO research Centre. Expertise and interests in:

Our solution

PREPROCESSING

  • Load data in phyloseq object
  • Agglomerate by species

Load data

# Load data
train <- pseq(inputdir = inputdir, subset = "train")
test <- pseq(inputdir = inputdir, subset = "scoring")

Agglomerating by Species

# Agglomerate by Species
train <- tax_glom(train, taxrank =  "Species")
train <- subset_taxa(train, Species != "s__")
test  <- tax_glom(test, taxrank = "Species")
test  <- subset_taxa(test, Species != "s__")

print(train)
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4868 taxa and 3615 samples ]
sample_data() Sample Data:       [ 3615 samples by 12 sample variables ]
tax_table()   Taxonomy Table:    [ 4868 taxa by 7 taxonomic ranks ]
print(test)
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4868 taxa and 1809 samples ]
sample_data() Sample Data:       [ 1809 samples by 10 sample variables ]
tax_table()   Taxonomy Table:    [ 4868 taxa by 7 taxonomic ranks ]

Microbiome-based feature engineering

  • Calculate disbiosis score
  • Calculate relative abundance of each Phylum
  • Calculate richness scores
  • Calculate co-abundance

Disbiosis score

# Relative abundance
train_relab <- transform_sample_counts(train, function(x) x / sum(x) )
test_relab  <- transform_sample_counts(test, function(x) x / sum(x) )

# Calculate disbiosis scores
train_filt <- subset_taxa(train_relab, Phylum %in% c("p__Firmicutes","p__Bacteroidetes"))
test_filt  <- subset_taxa(test_relab, Phylum %in% c("p__Firmicutes","p__Bacteroidetes"))

calculate_f_b_ratio <- function(data){
  f_tax <- subset_taxa(data, Phylum == "p__Firmicutes")
  b_tax <- subset_taxa(data, Phylum == "p__Bacteroidetes")
  
  f_abundance <- sample_sums(f_tax)
  b_abundance <- sample_sums(b_tax)
  return(f_abundance / b_abundance)
}

disbiosis_train <- calculate_f_b_ratio(train_filt)
disbiosis_test <- calculate_f_b_ratio(test_filt)
sample_data(train)$disbiosis <- disbiosis_train
sample_data(test)$disbiosis <- disbiosis_test

print(head(disbiosis_train))
 Simulated_328 Simulated_1644 Simulated_1710 Simulated_1732 Simulated_1727 Simulated_2196 
     1.1275385      0.0509096      0.4192907      0.4470816      0.8496063      1.4721705 

Relative abundance of phylos

# Relative Abundance of phylos 
train_ph <- tax_glom(train_relab, taxrank =  "Phylum")
test_ph <- tax_glom(test_relab, taxrank = "Phylum")

train_ph <- subset_taxa(train_ph, Domain %in% c("k__Archaea", "k__Bacteria"))
test_ph  <- subset_taxa(test_ph, Domain %in% c("k__Archaea", "k__Bacteria"))

train_phylos <- t(train_ph@otu_table@.Data)
colnames(train_phylos) <- tax_table(train_ph)[,2]

test_phylos <- t(test_ph@otu_table@.Data)
colnames(test_phylos) <- tax_table(test_ph)[,2]

dim(train_phylos)
[1] 3615   36
print(colnames(train_phylos))
Taxonomy Table:     [36 taxa by 1 taxonomic ranks]:
           Phylum                    
taxid_9    "p__Crenarchaeota"        
taxid_148  "p__Euryarchaeota"        
taxid_261  "p__Thaumarchaeota"       
taxid_295  "p__Acidobacteria"        
taxid_355  "p__Actinobacteria"       
taxid_1237 "p__Aquificae"            
taxid_1256 "p__Armatimonadetes"      
taxid_1281 "p__Bacteroidetes"        
taxid_1732 "p__Caldiserica"          
taxid_1733 "p__Calditrichaeota"      
taxid_1746 "p__Chlamydiae"           
taxid_1760 "p__Chlorobi"             
taxid_1787 "p__Chloroflexi"          
taxid_1798 "p__Chrysiogenetes"       
taxid_1802 "p__Cyanobacteria"        
taxid_1809 "p__Deferribacteres"      
taxid_1823 "p__Deinococcus-Thermus"  
taxid_1856 "p__Dictyoglomi"          
taxid_1858 "p__Elusimicrobia"        
taxid_1861 "p__Fibrobacteres"        
taxid_2764 "p__Firmicutes"           
taxid_2951 "p__Fusobacteria"         
taxid_2971 "p__Gemmatimonadetes"     
taxid_2974 "p__Ignavibacteriae"      
taxid_2976 "p__Kiritimatiellaeota"   
taxid_2977 "p__Lentisphaerae"        
taxid_2978 "p__Nitrospinae"          
taxid_2985 "p__Nitrospirae"          
taxid_2996 "p__Planctomycetes"       
taxid_4500 "p__Proteobacteria"       
taxid_5098 "p__Spirochaetes"         
taxid_5109 "p__Synergistetes"        
taxid_5179 "p__Tenericutes"          
taxid_5250 "p__Thermodesulfobacteria"
taxid_5272 "p__Thermotogae"          
taxid_5295 "p__Verrucomicrobia"      

Calculate richness

# Calculate richness
richness_train <- estimate_richness(
                        train,
                        split = TRUE,
                        measures = c("Shannon", "Observed",
                                     "Chao1", "Simpson",
                                     "InvSimpson", "Fisher"))

richness_test <- estimate_richness(
                        test,
                        split = TRUE,
                        measures = c("Shannon", "Observed",
                                     "Chao1", "Simpson",
                                     "InvSimpson", "Fisher"))

Calculate co-abundances

# Filter taxa by counts
train <- filter_taxa(train,
                     function(x) sum(x > 2) > (0.8 * length(x)), TRUE)

# Calculate co-abundances
coab_taxa <- co_abundances(train)
DT::datatable(coab_taxa)
# Caculate GSVA
scores <- get_scores(coab_taxa, train, test, method = "gsva")
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels

  |                                                                                      
  |                                                                                |   0%
  |                                                                                      
  |===                                                                             |   4%
  |                                                                                      
  |=======                                                                         |   8%
  |                                                                                      
  |==========                                                                      |  12%
  |                                                                                      
  |=============                                                                   |  17%
  |                                                                                      
  |=================                                                               |  21%
  |                                                                                      
  |====================                                                            |  25%
  |                                                                                      
  |=======================                                                         |  29%
  |                                                                                      
  |===========================                                                     |  33%
  |                                                                                      
  |==============================                                                  |  38%
  |                                                                                      
  |=================================                                               |  42%
  |                                                                                      
  |=====================================                                           |  46%
  |                                                                                      
  |========================================                                        |  50%
  |                                                                                      
  |===========================================                                     |  54%
  |                                                                                      
  |===============================================                                 |  58%
  |                                                                                      
  |==================================================                              |  62%
  |                                                                                      
  |=====================================================                           |  67%
  |                                                                                      
  |=========================================================                       |  71%
  |                                                                                      
  |============================================================                    |  75%
  |                                                                                      
  |===============================================================                 |  79%
  |                                                                                      
  |===================================================================             |  83%
  |                                                                                      
  |======================================================================          |  88%
  |                                                                                      
  |=========================================================================       |  92%
  |                                                                                      
  |=============================================================================   |  96%
  |                                                                                      
  |================================================================================| 100%

Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels

  |                                                                                      
  |                                                                                |   0%
  |                                                                                      
  |===                                                                             |   4%
  |                                                                                      
  |=======                                                                         |   8%
  |                                                                                      
  |==========                                                                      |  12%
  |                                                                                      
  |=============                                                                   |  17%
  |                                                                                      
  |=================                                                               |  21%
  |                                                                                      
  |====================                                                            |  25%
  |                                                                                      
  |=======================                                                         |  29%
  |                                                                                      
  |===========================                                                     |  33%
  |                                                                                      
  |==============================                                                  |  38%
  |                                                                                      
  |=================================                                               |  42%
  |                                                                                      
  |=====================================                                           |  46%
  |                                                                                      
  |========================================                                        |  50%
  |                                                                                      
  |===========================================                                     |  54%
  |                                                                                      
  |===============================================                                 |  58%
  |                                                                                      
  |==================================================                              |  62%
  |                                                                                      
  |=====================================================                           |  67%
  |                                                                                      
  |=========================================================                       |  71%
  |                                                                                      
  |============================================================                    |  75%
  |                                                                                      
  |===============================================================                 |  79%
  |                                                                                      
  |===================================================================             |  83%
  |                                                                                      
  |======================================================================          |  88%
  |                                                                                      
  |=========================================================================       |  92%
  |                                                                                      
  |=============================================================================   |  96%
  |                                                                                      
  |================================================================================| 100%
# Create train and test data
pheno_train <- sample_data(train)
x_train <- data.frame(pheno_train[, -c(8, 9)],
                      richness_train,
                      train_phylos,
                      scores$train)
y_train <- pheno_train[, c("Event_time", "Event")]

pheno_test <- sample_data(test)
x_test <- data.frame(pheno_test,
                     richness_test,
                     test_phylos,
                     scores$test)

Filtering samples

  • Remove NA values in train
  • Impute NA values in test
  • Remove negative survival times
  • Delete PrevalentHFAIL variable
# Check data  
# ======
stopifnot(ncol(x_train) == ncol(x_test))
stopifnot(colnames(x_train) == colnames(x_test))
stopifnot(nrow(x_train) == nrow(y_train))

# Creating train and test data
train <- cbind.data.frame(x_train, y_train)
test <- as.data.frame(x_test)

# What do we do with ...
# ======
# NA's values in train and test?
train <- train[complete.cases(train), ]
test <- missRanger(test, pmm.k = 10, seed = 153)

Missing value imputation by random forests

  Variables to impute:      Smoking, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol
  Variables used to impute: Age, BodyMassIndex, Smoking, BPTreatment, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Sex, disbiosis, Observed, Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, p__Crenarchaeota, p__Euryarchaeota, p__Thaumarchaeota, p__Acidobacteria, p__Actinobacteria, p__Aquificae, p__Armatimonadetes, p__Bacteroidetes, p__Caldiserica, p__Calditrichaeota, p__Chlamydiae, p__Chlorobi, p__Chloroflexi, p__Chrysiogenetes, p__Cyanobacteria, p__Deferribacteres, p__Deinococcus.Thermus, p__Dictyoglomi, p__Elusimicrobia, p__Fibrobacteres, p__Firmicutes, p__Fusobacteria, p__Gemmatimonadetes, p__Ignavibacteriae, p__Kiritimatiellaeota, p__Lentisphaerae, p__Nitrospinae, p__Nitrospirae, p__Planctomycetes, p__Proteobacteria, p__Spirochaetes, p__Synergistetes, p__Tenericutes, p__Thermodesulfobacteria, p__Thermotogae, p__Verrucomicrobia, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, cluster_20, cluster_21, cluster_22, cluster_23, cluster_24
iter 1: ......
iter 2: ......
iter 3: ......
iter 4: ......
# Negatives survival values in train and test?
train <- train[-which(train$Event_time < 0), ]
train$Event_time <- train$Event_time + 0.1
#test$Event_time[which(test$Event_time < 0)] <- 15

# Removing PrevalentHFAIL (only 0)
train <- train[, -grep("PrevalentHFAIL", colnames(train))]
test <- test[, -grep("PrevalentHFAIL", colnames(test))]

DT::datatable(train)
Warning in instance$preRenderHook(instance): It seems your data is too big for client-
side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/
server.html

Feature Selection with biospear

# Fit model
# ========
method = "lasso"
cvrts <- c("Age", "BodyMassIndex",
           "SystolicBP", "NonHDLcholesterol",
           "Sex", "disbiosis")

models <- fit_biospear(data = train,
                       biomarkers = setdiff(colnames(train),
                                            c(cvrts, colnames(y_train))),
                       surv = c("Event_time", "Event"),
                       cvrts = cvrts,
                       inter = FALSE,
                       methods = method)

Computing selection with method: lasso
print(models)
$lasso
                  Age         BodyMassIndex            SystolicBP     NonHDLcholesterol 
          0.044753515           0.011847561           0.002076959          -0.094485660 
                  Sex             disbiosis          PrevalentCHD            InvSimpson 
          0.032552521           0.005092198           0.050703722          -0.019780689 
   p__Armatimonadetes           p__Chlorobi p__Kiritimatiellaeota        p__Nitrospinae 
         -0.032408719          -0.028157328           0.008537367           0.075069928 
            cluster_5 
          0.097205879 

Build final model

# Build cox with all train data
# ====
selected_betas <- models$lasso
train <- train[, match(
  c("Event", "Event_time", names(selected_betas)),
  colnames(train))]

surv <- "Surv(Event_time, Event) ~"
f <- as.formula(paste0(surv, paste(names(selected_betas),collapse='+')))
model <- coxph(f, train)

Predict risk

# Risk Prediction
# ====
risk = function(model, newdata, time) {
  as.numeric(1-summary(
    survfit(model, newdata = newdata, se.fit = F, conf.int = F), 
    times = time, extend = TRUE)$surv)
}

pred <- risk(model, test, time = 15)

Final scores

# Save scores
scores <- data.frame(
    SampleID = rownames(test),
    Score = pred
)
print(head(scores))
        SampleID      Score
1 Simulated_2211 0.03586051
2 Simulated_1629 0.02333127
3 Simulated_1690 0.01572205
4 Simulated_1367 0.03958909
5 Simulated_3387 0.02845292
6 Simulated_1746 0.04254048
#write.csv(scores, quote = FALSE, row.names = FALSE,
#          file = file.path(outputdir, "scores.csv"))

Run time

end <- Sys.time()
time <- difftime(end, start, units = "mins")
print(time)
Time difference of 12.9126 mins